Let’s triangulate marker genes for each of the lineages that we find enrichment in. We have three sources of information:
Let’s take the Intersection where we take the intersect of loosely filtered results multiple approaches
These genes will be used as a feature space for manual pseudobulk-based regression
N.B. these are lineages derived from consensus clustering on NMF weights This dimensionality reduction approach actually lead to cleaner clustering of celltypes than just clustering celltypes by correlation
library(tidyverse)
library(ggpubr)
Code from AUCell for adaptive thresholding
library(AUCell)
### Helper function for AUC thresholding from AUCell
get_Threshold <- function (auc, gSetName, plotHist = TRUE, smallestPopPercent = 0.25,
densAdjust = 2, thrP = 0.01, nBreaks = 100)
{
aucRow <- t(as.matrix(auc))
#gSetName <- rownames(aucRow)[1]
nCells <- length(auc)
skipGlobal <- TRUE
skipRed <- FALSE
skipSmallDens <- FALSE
commentMsg <- ""
aucThrs <- c()
notPopPercent <- 1 - smallestPopPercent
if (sum(auc == 0) > (nCells * notPopPercent)) {
skipGlobal <- FALSE
commentMsg <- paste(commentMsg, round((sum(auc == 0)/nCells) *
100), "% (more than ", notPopPercent, "%) of AUC are zero. ",
sep = "")
}
meanAUC <- mean(auc)
sdAUC <- sd(auc)
maybeNormalDistr <- !suppressWarnings(ks.test(auc, rnorm(max(100,
length(auc)), mean = meanAUC, sd = sdAUC), alternative = "less")$p.value <
0.01)
if (maybeNormalDistr) {
commentMsg <- paste0(commentMsg, "The AUC might follow a normal distribution (random gene-set?). ")
skipGlobal <- FALSE
aucThrs["outlierOfGlobal"] <- qnorm(1 - (thrP/nCells),
mean = meanAUC, sd = sdAUC)
}
histogram <- hist(c(0, auc/max(auc)), breaks = 100, plot = FALSE)$count
if ((sum(histogram[1:5])/sum(histogram)) >= notPopPercent *
0.75) {
skipGlobal <- FALSE
skipRed <- TRUE
skipSmallDens <- TRUE
}
if ((sum(histogram[1:10])/sum(histogram)) >= notPopPercent *
0.5) {
skipSmallDens <- TRUE
skipGlobal <- FALSE
aucThrs["tenPercentOfMax"] <- max(auc) * 0.1
}
densCurve <- density(auc, adjust = densAdjust, cut = 0)
maximumsDens <- NULL
inflPoints <- diff(sign(diff(densCurve$y)))
maximumsDens <- which(inflPoints == -2)
globalMax <- maximumsDens[which.max(densCurve$y[maximumsDens])]
minimumDens <- which(inflPoints == 2)
smallMin <- NULL
if (!skipSmallDens)
smallMin <- data.table::last(minimumDens[which(minimumDens <
globalMax)])
minimumDens <- c(smallMin, minimumDens[which(minimumDens >
globalMax)])
densTrh <- NULL
if (length(minimumDens) > 0) {
densTrh <- densCurve$x[min(minimumDens)]
if (length(maximumsDens) > 0) {
nextMaxs <- maximumsDens[which(densCurve$x[maximumsDens] >
densTrh)]
if ((max(densCurve$y[nextMaxs])/max(densCurve$y)) <
0.05) {
densTrh <- NULL
}
}
}
auc <- sort(auc)
distrs <- list()
distrs[["Global_k1"]] <- list(mu = c(meanAUC, NA), sigma = c(sdAUC,
NA), x = auc)
if ("mixtools" %in% rownames(installed.packages())) {
na <- capture.output(distrs[["k2"]] <- tryCatch(mixtools::normalmixEM(auc,
fast = FALSE, k = 2, verb = FALSE), error = function(e) {
return(NULL)
}))
na <- capture.output(distrs[["k3"]] <- tryCatch(mixtools::normalmixEM(auc,
fast = FALSE, k = 3, verb = FALSE), error = function(e) {
return(NULL)
}))
if (is.null(distrs[["k2"]]) && is.null(distrs[["k3"]])) {
if (sum(auc == 0) < (nCells * notPopPercent * 0.5))
skipGlobal <- FALSE
}
if (!is.null(distrs[["k2"]])) {
compL <- which.min(distrs[["k2"]][["mu"]])
compR <- which.max(distrs[["k2"]][["mu"]])
height1 <- 0.4/distrs[["k2"]][["sigma"]][compL] *
distrs[["k2"]][["lambda"]][compL]
height2 <- 0.4/distrs[["k2"]][["sigma"]][compR] *
distrs[["k2"]][["lambda"]][compR]
taller <- height1 < height2
globalInclInFirst <- (distrs[["Global_k1"]]$mu[1] <
(distrs[["k2"]][["mu"]][compL] + (1.5 * distrs[["k2"]][["sigma"]][compL])))
includedInGlobal <- ((distrs[["k2"]][["mu"]][compL] >
(distrs[["Global_k1"]]$mu[1] - distrs[["Global_k1"]]$sigma[1])) &&
(distrs[["k2"]][["mu"]][compR] < (distrs[["Global_k1"]]$mu[1] +
distrs[["Global_k1"]]$sigma[1])))
if (taller || (globalInclInFirst && includedInGlobal)) {
skipGlobal <- FALSE
if (globalInclInFirst && includedInGlobal)
commentMsg <- paste(commentMsg, "The global distribution overlaps the partial distributions. ")
if (taller && !includedInGlobal)
commentMsg <- paste(commentMsg, "The right distribution is taller. ")
}
}
}
else {
warning("Package 'mixtools' is not available to calculate the sub-distributions.")
}
glProb <- 1 - (thrP/nCells + smallestPopPercent)
aucThrs["Global_k1"] <- qnorm(glProb, mean = distrs[["Global_k1"]][["mu"]][1],
sd = distrs[["Global_k1"]][["sigma"]][1])
if (!is.null(distrs[["k2"]])) {
k2_L <- which.min(distrs[["k2"]][["mu"]])
aucThrs["L_k2"] <- qnorm(1 - (thrP/nCells), mean = distrs[["k2"]][["mu"]][k2_L],
sd = distrs[["k2"]][["sigma"]][k2_L])
}
if (!is.null(distrs[["k3"]])) {
k3_R <- which.max(distrs[["k3"]][["mu"]])
k3_R_threshold <- qnorm(thrP, mean = distrs[["k3"]][["mu"]][k3_R],
sd = distrs[["k3"]][["sigma"]][k3_R])
if (k3_R_threshold > 0)
aucThrs["R_k3"] <- k3_R_threshold
}
if (!is.null(densTrh)) {
aucThrs["minimumDens"] <- densTrh
}
aucThr <- aucThrs
if (skipGlobal)
aucThr <- aucThrs[which(!names(aucThrs) %in% "Global_k1")]
if (skipRed)
aucThr <- aucThrs[which(!names(aucThrs) %in% "L_k2")]
aucThr <- aucThr[which.max(aucThr)]
if ((length(aucThr) > 0) && (names(aucThr) == "minimumDens")) {
maximumsDens <- maximumsDens[which(densCurve$y[maximumsDens] >
1)]
if (length(maximumsDens) > 2) {
tmp <- cbind(minimumDens[seq_len(length(maximumsDens) -
1)], maximumsDens[-1])
FCs <- densCurve$y[tmp[, 2]]/densCurve$y[tmp[, 1]]
if (any(FCs > 1.5))
warning(gSetName, ":\tCheck the AUC histogram. ",
"'minimumDens' was selected as the best threshold, ",
"but there might be several distributions in the AUC.")
}
}
if ("minimumDens" %in% names(aucThrs))
aucThr <- aucThrs["minimumDens"]
if (length(aucThr) == 0)
aucThr <- aucThrs[which.max(aucThrs)]
if (length(aucThr) == 0)
aucThr <- 1
if (length(aucThr) > 1)
aucThr <- unlist(aucThr[which.max(aucThr)])
if (plotHist) {
histInfo <- AUCell_plotHist(aucRow, aucThr = aucThr,
nBreaks = nBreaks)
histMax <- max(histInfo[[gSetName]]$counts)
densCurve$y <- densCurve$y * (histMax/max(densCurve$y))
thisLwd <- ifelse((aucThrs["minimumDens"] == aucThr) &&
(!is.null(aucThr) && !is.null(aucThrs["minimumDens"])),
3, 1)
lines(densCurve, lty = 1, lwd = thisLwd, col = "blue")
if (!is.null(minimumDens))
points(densCurve$x[minimumDens], densCurve$y[minimumDens],
pch = 16, col = "darkblue")
scalFact <- 1
aucDistr <- dnorm(distrs[["Global_k1"]][["x"]], mean = distrs[["Global_k1"]][["mu"]][1],
sd = distrs[["Global_k1"]][["sigma"]][1])
scalFact <- (histMax/max(aucDistr)) * 0.95
thisLwd <- ifelse(aucThrs["Global_k1"] == aucThr, 3,
1)
lines(distrs[["Global_k1"]][["x"]], scalFact * aucDistr,
col = "darkgrey", lwd = thisLwd, lty = 2)
if (!is.null(distrs[["k2"]])) {
aucDistr <- dnorm(distrs[["k2"]][["x"]], mean = distrs[["k2"]][["mu"]][k2_L],
sd = distrs[["k2"]][["sigma"]][k2_L])
scalFact <- (histMax/max(aucDistr)) * 0.95
thisLwd <- ifelse(aucThrs["k2"] == aucThr, 3, 1)
lines(distrs[["k2"]][["x"]], scalFact * aucDistr,
col = "red", lwd = thisLwd, lty = 2)
rect(distrs[["k2"]][["mu"]][k2_L] - distrs[["k2"]][["sigma"]][k2_L],
histMax - (histMax * 0.02), distrs[["k2"]][["mu"]][k2_L] +
distrs[["k2"]][["sigma"]][k2_L], histMax, col = "#70000030",
border = "#00009000")
}
if ((!is.null(distrs[["k3"]])) && ("R_k3" %in% names(aucThrs))) {
k3_L <- which.min(distrs[["k3"]][["mu"]])
aucDistr2 <- dnorm(distrs[["k3"]][["x"]], mean = distrs[["k3"]][["mu"]][k3_R],
sd = distrs[["k3"]][["sigma"]][k3_R])
scalFact2 <- scalFact * (distrs[["k3"]][["lambda"]][k3_R]/distrs[["k3"]][["lambda"]][k3_L])
thisLwd <- ifelse(aucThrs["k3"] == aucThr, 3, 1)
lines(distrs[["k3"]][["x"]], scalFact2 * aucDistr2,
col = "magenta", lwd = thisLwd, lty = 2)
rect(distrs[["k3"]][["mu"]][k3_R] - distrs[["k3"]][["sigma"]][k3_R],
histMax - (histMax * 0.02), distrs[["k3"]][["mu"]][k3_R] +
distrs[["k3"]][["sigma"]][k3_R], histMax, col = "#80808030",
border = "#80808030")
}
aucThrs <- aucThrs[!is.na(aucThrs)]
if (length(aucThrs) > 0) {
pars <- list()
pars[["Global_k1"]] <- c(col1 = "#909090", col2 = "black",
pos = 0.9)
pars[["L_k2"]] <- c(col1 = "red", col2 = "darkred",
pos = 0.8)
pars[["R_k3"]] <- c(col1 = "magenta", col2 = "magenta",
pos = 0.6)
pars[["minimumDens"]] <- c(col1 = "blue", col2 = "darkblue",
pos = 0.4)
pars[["tenPercentOfMax"]] <- c(col1 = "darkgreen",
col2 = "darkgreen", pos = 0.9)
pars[["outlierOfGlobal"]] <- c(col1 = "darkgreen",
col2 = "darkgreen", pos = 0.9)
for (thr in names(aucThrs)) {
thisLwd <- ifelse(aucThrs[thr] == aucThr, 5,
2)
thisLty <- ifelse(aucThrs[thr] == aucThr, 1,
3)
abline(v = aucThrs[thr], col = pars[[thr]][1],
lwd = thisLwd, lty = thisLty)
xPos <- aucThrs[thr] * 1.01
if (aucThrs[thr] > (max(auc) * 0.8))
xPos <- 0
if (aucThrs[thr] == aucThr)
text(xPos, histMax * as.numeric(pars[[thr]][3]),
pos = 4, col = pars[[thr]][2], cex = 0.8,
paste("AUC > ", signif(aucThrs[thr], 2),
"\n(", sum(auc > aucThrs[thr]), " cells)",
sep = ""))
}
}
}
return(list(selected = aucThr, thresholds = cbind(threshold = aucThrs,
nCells = sapply(aucThrs, function(x) sum(auc > x))),
comment = commentMsg))
}
library(Seurat)
Attaching SeuratObject
library(tidyverse)
Lineage_DE <- read_csv("BALL_DEresults_NMF_Lineage.csv")
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Gene = col_character(),
baseMean = col_double(),
log2FoldChange = col_double(),
lfcSE = col_double(),
stat = col_double(),
pvalue = col_double(),
padj = col_double(),
Lineage = col_character()
)
Lineage_DE
Lineage_DE %>%
filter(padj < 0.01) %>%
pull(Lineage) %>% table() %>% sort(decreasing = T)
.
Erythroid T_NK Pro_B Pre_B Mature_B Monocyte Myeloid_Progenitor MLP_CLP_PreProB Pre_pDC
12871 12507 9623 8721 7703 7186 4309 1829 826
HSC_MPP_LMPP
469
Pearson correlation VST NMF stringent Intersect on Pseudobulk DE FDR 0.05
Or LASSO regression Pearson correlation stringent VST NMF + Pseudobulk DE stringent
Set adaptive thresholds
Ths is the pearson correlation between VST-normalized gene expression
and the score for each NMF lineage
NMF_corr <- data.table::fread('NMF_gene_corr.csv') %>% select(-V1)
Warning: Detected 5 column names but the data has 6 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.
NMF_corr
NMF_corr %>%
filter(qvalue < 0.05, pearson > 0) %>%
pull(NMF) %>% table() %>% sort(decreasing = TRUE)
.
NMF4 NMF10 NMF7 NMF8 NMF2 NMF1 NMF6 NMF9 NMF3 NMF5
3085 1617 1286 1270 1143 1135 1037 935 805 636
NMF_corr_thresholds = data.frame()
for(lin in unique(NMF_corr$NMF)){
thresholds = get_Threshold(NMF_corr %>% filter(NMF == lin, qvalue < 0.05, pearson > 0) %>% pull(pearson), lin)$thresholds
NMF_corr_thresholds =
bind_rows(
NMF_corr_thresholds,
data.frame(
'NMF' = lin,
'K1_threshold' = thresholds['Global_k1','threshold'],
'K2_threshold' = thresholds['L_k2','threshold']
))
}
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Visualize thresholding within all positive correlations
NMF_corr_thresholds %>%
left_join(NMF_corr) %>% filter(pearson > 0) %>%
mutate(NMF = factor(NMF, levels = c('NMF6', 'NMF8', 'NMF2', 'NMF1', 'NMF3', 'NMF9', 'NMF4',
'NMF5', 'NMF10', 'NMF7'))) %>%
mutate(threshold = ifelse(pearson > K1_threshold, 'pass', 'fail')) %>%
ggplot(aes(x = pearson, fill = threshold)) +
geom_histogram(bins=100) + theme_pubr(legend = 'top') +
scale_fill_brewer(palette = 'Dark2', direction = -1) +
facet_wrap(.~NMF, scale = 'free', ncol=5) +
geom_vline(aes(xintercept = K1_threshold), lty=2)
Joining with `by = join_by(NMF)`Warning: Each row in `x` is expected to match at most 1 row in `y`.
Visualize thresholding within FDR < 0.05 correlations
NMF_corr_thresholds %>%
left_join(NMF_corr) %>% filter(qvalue < 0.05, pearson > 0) %>%
mutate(NMF = factor(NMF, levels = c('NMF6', 'NMF8', 'NMF2', 'NMF1', 'NMF3', 'NMF9', 'NMF4',
'NMF5', 'NMF10', 'NMF7'))) %>%
mutate(threshold = ifelse(pearson > K1_threshold, 'pass', 'fail')) %>%
ggplot(aes(x = pearson, fill = threshold)) +
geom_histogram(bins=100) + theme_pubr(legend = 'top') +
scale_fill_brewer(palette = 'Dark2', direction = -1) +
facet_wrap(.~NMF, scale = 'free', ncol=5) +
geom_vline(aes(xintercept = K1_threshold), lty=2)
Joining with `by = join_by(NMF)`Warning: Each row in `x` is expected to match at most 1 row in `y`.
NMF_corr_pos <- NMF_corr_thresholds %>%
left_join(NMF_corr) %>%
mutate(threshold = ifelse(pearson > K1_threshold, 'pass', 'fail'))
Joining with `by = join_by(NMF)`Warning: Each row in `x` is expected to match at most 1 row in `y`.
NMF_corr_pos %>%
filter(qvalue < 0.05, pearson > 0) %>%
filter(threshold == 'pass') %>%
pull(NMF) %>% table() %>% sort(decreasing = T)
.
NMF4 NMF10 NMF7 NMF8 NMF1 NMF2 NMF6 NMF3 NMF9 NMF5
698 328 282 279 262 244 200 196 156 144
NMF_corr_neg_thresholds = data.frame()
for(lin in unique(NMF_corr$NMF)){
thresholds = get_Threshold(NMF_corr %>% filter(NMF == lin, qvalue < 0.05, pearson < 0) %>% mutate(pearson = -pearson) %>% pull(pearson), lin)$thresholds
NMF_corr_neg_thresholds =
bind_rows(
NMF_corr_neg_thresholds,
data.frame(
'NMF' = lin,
'K1_threshold' = thresholds['Global_k1','threshold'],
'K2_threshold' = thresholds['L_k2','threshold']
))
}
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Warning: no non-missing arguments to max; returning -Inf
Visualize thresholding within all negative correlations
NMF_corr_neg_thresholds %>%
left_join(NMF_corr) %>% filter(pearson < 0) %>%
mutate(negative_pearson = -pearson) %>%
mutate(NMF = factor(NMF, levels = c('NMF6', 'NMF8', 'NMF2', 'NMF1', 'NMF3', 'NMF9', 'NMF4',
'NMF5', 'NMF11', 'NMF10', 'NMF7'))) %>%
mutate(threshold = ifelse(negative_pearson > K1_threshold, 'pass', 'fail')) %>%
ggplot(aes(x = negative_pearson, fill = threshold)) +
geom_histogram(bins=100) + theme_pubr(legend = 'top') +
scale_fill_brewer(palette = 'Dark2', direction = -1) +
facet_wrap(.~NMF, scale = 'free', ncol=6) +
geom_vline(aes(xintercept = K1_threshold), lty=2)
Joining with `by = join_by(NMF)`Warning: Each row in `x` is expected to match at most 1 row in `y`.
NMF_corr_neg_thresholds
Visualize thresholding within FDR < 0.05 correlations
NMF_corr_neg_thresholds %>%
left_join(NMF_corr) %>% filter(qvalue < 0.05, pearson < 0) %>%
mutate(negative_pearson = -pearson) %>%
mutate(NMF = factor(NMF, levels = c('NMF6', 'NMF8', 'NMF2', 'NMF1', 'NMF3', 'NMF9', 'NMF4',
'NMF5', 'NMF11', 'NMF10', 'NMF7'))) %>%
mutate(threshold = ifelse(negative_pearson > K1_threshold, 'pass', 'fail')) %>%
ggplot(aes(x = negative_pearson, fill = threshold)) +
geom_histogram(bins=100) + theme_pubr(legend = 'top') +
scale_fill_brewer(palette = 'Dark2', direction = -1) +
facet_wrap(.~NMF, scale = 'free', ncol=6) +
geom_vline(aes(xintercept = K1_threshold), lty=2)
Joining with `by = join_by(NMF)`Warning: Each row in `x` is expected to match at most 1 row in `y`.
NMF_corr_neg_thresholds
NMF_corr_neg <- NMF_corr_neg_thresholds %>%
left_join(NMF_corr) %>%
mutate(threshold = ifelse(pearson < -K1_threshold, 'pass', 'fail'))
Joining with `by = join_by(NMF)`Warning: Each row in `x` is expected to match at most 1 row in `y`.
NMF_corr_neg %>%
filter(qvalue < 0.05, pearson < 0) %>%
filter(threshold == 'pass') %>%
pull(NMF) %>% table() %>% sort(decreasing = T)
.
NMF4 NMF7 NMF1 NMF2 NMF10 NMF8 NMF5 NMF6 NMF9 NMF3
821 399 392 312 252 178 173 130 94 80
NMF_corr_neg %>%
filter(qvalue < 0.05, pearson < 0)
NMF_corr_thresholding <- NMF_corr_pos %>% select(NMF, Gene, pearson, pvalue, qvalue, pos_K1_threshold = K1_threshold, pos_threshold = threshold) %>%
left_join(NMF_corr_neg %>% select(NMF, Gene, neg_K1_threshold = K1_threshold, neg_threshold = threshold)) %>%
mutate(neg_K1_threshold = -neg_K1_threshold, threshold = ifelse(pos_threshold == 'pass' | neg_threshold == 'pass', 'pass', 'fail')) %>%
select(NMF, Gene, pearson, pvalue, qvalue, pos_K1_threshold, neg_K1_threshold, threshold)
Joining with `by = join_by(NMF, Gene)`
NMF_corr_thresholding
#NMF_corr_thresholding %>% write_csv("NMF_GeneCorr_Thresholding.csv")